//
// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
//
// SPDX-License-Identifier: Apache-2.0
//

// Suppress -Woverlength-strings: the inline assembly in this file is written as one very long string literal.
#pragma GCC diagnostic ignored "-Woverlength-strings"

#if !defined(__aarch64__) || !defined(__ARM_FEATURE_SVE2)
#error This file must be compiled for AArch64, FEAT_SVE2.
#else  // Architectural features check.

#include "kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.h"

#include <stddef.h>
#include <stdint.h>

#include "kai/kai_common.h"

// Blocking parameters of this micro-kernel. The getter functions below derive
// the values they report to callers from these constants.
static const size_t kai_m_step = 1;   // Base m step; see kai_get_m_step_*().
static const size_t kai_n_step = 16;  // Multiplied by the SME vector length (u32 elements) in kai_get_n_step_*().
static const size_t kai_nr = 2;       // Multiplied by the SME vector length (u32 elements) in kai_get_nr_*().
static const size_t kai_kr = 1;       // Returned unchanged by kai_get_kr_*().
static const size_t kai_sr = 1;       // Returned unchanged by kai_get_sr_*().

// Returns the m step of this micro-kernel, i.e. the number of LHS/DST rows
// processed per step.
//
// The value is the plain kai_m_step constant and is deliberately NOT scaled by
// the SME vector length: the run function of this kernel asserts m == 1, so
// advertising a larger row step would lead callers to pass row blocks that the
// kernel rejects.
size_t kai_get_m_step_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(void) {
    return kai_m_step;
}

// Returns the n step of this micro-kernel: kai_n_step multiplied by the value
// reported by kai_get_sme_vector_length_u32().
size_t kai_get_n_step_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(void) {
    const size_t vl_u32 = kai_get_sme_vector_length_u32();
    return vl_u32 * kai_n_step;
}

// Returns the nr value of this micro-kernel: kai_nr multiplied by the value
// reported by kai_get_sme_vector_length_u32().
// NOTE(review): presumably this must match the nr used when packing the RHS - confirm against the packing routine.
size_t kai_get_nr_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(void) {
    const size_t vl_u32 = kai_get_sme_vector_length_u32();
    return vl_u32 * kai_nr;
}

// Returns the kr value of this micro-kernel (the kai_kr constant, unscaled).
size_t kai_get_kr_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(void) {
    const size_t kr = kai_kr;
    return kr;
}

// Returns the sr value of this micro-kernel (the kai_sr constant, unscaled).
size_t kai_get_sr_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(void) {
    const size_t sr = kai_sr;
    return sr;
}

// Returns the offset into the LHS matrix for row index m_idx.
//
// m_idx:      row index; must be a multiple of the kernel's m step.
// lhs_stride: distance between consecutive LHS rows
//             (NOTE(review): presumably in bytes - confirm against the header).
//
// The result is m_idx * lhs_stride.
size_t kai_get_lhs_offset_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(size_t m_idx, size_t lhs_stride) {
    KAI_ASSUME(m_idx % kai_get_m_step_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla() == 0);

    const size_t row_offset = m_idx * lhs_stride;
    return row_offset;
}

// Returns the byte offset into the packed RHS buffer for column index n_idx.
//
// n_idx: column index; must be a multiple of the kernel's n step.
// k:     number of float elements along the K dimension.
//
// Every column index accounts for k floats plus one additional float
// (presumably the per-column bias stored alongside the weights - confirm
// against the packing routine).
size_t kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(size_t n_idx, size_t k) {
    KAI_ASSUME(n_idx % kai_get_n_step_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla() == 0);

    const size_t bytes_per_column = (k * sizeof(float)) + sizeof(float);
    return n_idx * bytes_per_column;
}

// Returns the offset into the destination matrix for element (m_idx, n_idx).
//
// m_idx:      row index; must be a multiple of the kernel's m step.
// n_idx:      column index; must be a multiple of the kernel's n step.
// dst_stride: distance between consecutive destination rows. It is added to a
//             byte quantity below, so it is expected to be expressed in bytes.
//
// The result is the row part (m_idx * dst_stride) plus the column part
// (n_idx float elements).
size_t kai_get_dst_offset_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(
    size_t m_idx, size_t n_idx, size_t dst_stride) {
    KAI_ASSUME(m_idx % kai_get_m_step_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla() == 0);
    KAI_ASSUME(n_idx % kai_get_n_step_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla() == 0);

    const size_t row_part = m_idx * dst_stride;
    const size_t col_part = n_idx * sizeof(float);
    return row_part + col_part;
}

// Returns the size in bytes of a densely stored m x n destination matrix of
// float elements.
//
// m: number of rows.
// n: number of columns.
size_t kai_get_dst_size_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(size_t m, size_t n) {
    const size_t num_elements = m * n;
    return num_elements * sizeof(float);
}

void kai_run_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(
    size_t m, size_t n, size_t k, const void* lhs, size_t lhs_stride, const void* rhs_packed, void* dst,
    size_t dst_stride_row, size_t dst_stride_col, float clamp_min, float clamp_max) {
    KAI_UNUSED(lhs_stride);
    KAI_UNUSED(dst_stride_row);
    KAI_UNUSED(dst_stride_col);

    KAI_ASSUME(m == 1);

    typedef struct {
        float maxval;
        float minval;
    } KernelArgs;

    KernelArgs ka;
    ka.maxval = clamp_max;
    ka.minval = clamp_min;

    size_t N = n;
    size_t K = k;

    const void* A_ptr = lhs;
    const void* B_ptr = rhs_packed;
    void* output_ptr = dst;

    uint64_t flags = 0;

    __asm__ __volatile__(
        ".inst 0xd503477f  // SMSTART ZA\n"
        "mov x8, #0x0\n"
        "mov x16, %x[B_ptr]\n"
        "cntw x15, ALL, MUL #4\n"
        "mov x14, %x[output_ptr]\n"
        "add x13, %x[N], x15\n"
        "ptrue p1.b\n"
        "sub x13, x13, #0x1\n"
        ".inst 0x25207811  // ptrue pn9.b\n"
        "udiv x13, x13, x15\n"
        "mov x22, #0x1\n"
        "add x21, x13, #0x3\n"
        "and x21, x21, #0xfffffffffffffffc\n"
        "mul x21, x21, x15\n"
        "mul x21, x21, %x[K]\n"
        "lsl x21, x21, #0x2\n"
        "1:"  // RHS size check loop
        "cmp x21, #0x200000\n"
        "blt 2f\n"
        "tbnz x21, #0, 3f\n"
        "lsr x21, x21, #0x1\n"
        "lsl x22, x22, #0x1\n"
        "b 1b\n"
        "2:"  // RHS do prefetch
        "lsl x20, x21, #0x26\n"
        "sub x22, x22, #0x1\n"
        "lsl x22, x22, #0x16\n"
        "orr x21, x21, x20\n"
        "orr x21, x21, x22\n"
        ".inst 0xf8b54a1a  // rprfm pldonce, x21, [x16]\n"
        "3:"  // RHS prefetch exit
        "mov x12, %x[K]\n"
        "cntw x20, ALL, MUL #2\n"
        "lsl x12, x12, #0x2\n"
        "add x12, x12, #0x4\n"
        "mul x12, x12, x20\n"
        "4:"  // Column loop
        "cmp x13, #0x4\n"
        "bge 22f\n"
        "cmp x13, #0x2\n"
        "bgt 16f\n"
        "beq 10f\n"
        "cntw x20, ALL, MUL #2\n"
        "add x22, x16, x12\n"
        ".inst 0xa0404614  // ld1w { z20.s-z21.s }, pn9.b/Z, [x16]\n"
        "cmp %x[N], x20\n"
        "mov x11, %x[K]\n"
        "csel x22, x22, x16, GT\n"
        "mov x21, %x[N]\n"
        ".inst 0xa04046d6  // ld1w { z22.s-z23.s }, pn9.b/Z, [x22]\n"
        "mov x10, %x[A_ptr]\n"
        "lsl x20, %x[K], #0x2\n"
        ".inst 0x25b567f0  // whilelt p8.s, XZR, x21, VLx4\n"
        "cmp x11, #0x4\n"
        ".inst 0xf8b44958  // rprfm pldmany, x20, [x10]\n"
        "addvl x16, x16, #2\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc0040e80  // mova za.d[x8, #0], { z20.d-z23.d }\n"
        "ble 6f\n"
        "5:"  // Width 1: Multiply loop: Main loop head
        "whilelt p0.s, XZR, x11\n"
        ".inst 0xa0404605  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        "ld1rqw { z15.s }, p0/Z, [x10]\n"
        "sub x11, x11, #0x4\n"
        "add x10, x10, #0x10\n"
        ".inst 0xa04046c7  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        "cmp x11, #0x4\n"
        ".inst 0xa040461d  // ldnt1w { z28.s-z29.s }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046df  // ldnt1w { z30.s-z31.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc15f8080  // fmla za.s[x8, 0], { z4.s-z7.s }, z15.s[0]\n"
        ".inst 0xa0404601  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046c3  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0404615  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046d7  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc15f8780  // fmla za.s[x8, 0], { z28.s-z31.s }, z15.s[1]\n"
        ".inst 0xc15f8800  // fmla za.s[x8, 0], { z0.s-z3.s }, z15.s[2]\n"
        ".inst 0xc15f8e80  // fmla za.s[x8, 0], { z20.s-z23.s }, z15.s[3]\n"
        "bgt 5b\n"
        "6:"  // Width 1: Multiply loop: Single iteration only
        "whilelt p0.s, XZR, x11\n"
        ".inst 0xa0404601  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x1\n"
        "ld1rqw { z8.s }, p0/Z, [x10]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046c3  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc1588000  // fmla za.s[x8, 0], { z0.s-z3.s }, z8.s[0]\n"
        "ble 7f\n"
        ".inst 0xa0404611  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x1\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046d3  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc1588600  // fmla za.s[x8, 0], { z16.s-z19.s }, z8.s[1]\n"
        "ble 7f\n"
        ".inst 0xa0404615  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x1\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046d7  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc1588a80  // fmla za.s[x8, 0], { z20.s-z23.s }, z8.s[2]\n"
        "ble 7f\n"
        ".inst 0xa040460d  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x16]\n"
        ".inst 0xa04046cf  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x22]\n"
        ".inst 0xc1588d80  // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s[3]\n"
        "7:"  // Width 1: Multiply loop: multiply skip
        "tbz %x[flags], #1, 8f\n"
        "add x21, %x[args_ptr], %[offset_min]\n"
        "add x20, %x[args_ptr], %[offset_max]\n"
        ".inst 0xc0060c08  // mova { z8.d-z11.d }, za.d[x8, #0]\n"
        "ld1rw { z21.s }, p1/Z, [x21]\n"
        "ld1rw { z29.s }, p1/Z, [x20]\n"
        ".inst 0xc1bdcaa8  // fclamp { z8.s-z11.s }, z21.s, z29.s\n"
        ".inst 0xa060c1c8  // st1w { z8.s-z11.s }, p8, [x14]\n"
        "b 9f\n"
        "8:"  // Width 1: No activation
        ".inst 0xc0060c08  // mova { z8.d-z11.d }, za.d[x8, #0]\n"
        ".inst 0xa060c1c8  // st1w { z8.s-z11.s }, p8, [x14]\n"
        "9:"  // Width 1: Output done
        "b 28f\n"
        "10:"  // Width 2
        "add x24, x16, x12, LSL #1\n"
        "cntw x20, ALL, MUL #6\n"
        ".inst 0xa0404604  // ld1w { z4.s-z5.s }, pn9.b/Z, [x16]\n"
        "add x23, x24, x12\n"
        "cmp %x[N], x20\n"
        ".inst 0xa0404700  // ld1w { z0.s-z1.s }, pn9.b/Z, [x24]\n"
        "add x22, x16, x12\n"
        "csel x23, x23, x16, GT\n"
        ".inst 0xa04046c6  // ld1w { z6.s-z7.s }, pn9.b/Z, [x22]\n"
        "mov x11, %x[K]\n"
        "sub x21, %x[N], x15\n"
        ".inst 0xa04046e2  // ld1w { z2.s-z3.s }, pn9.b/Z, [x23]\n"
        "mov x10, %x[A_ptr]\n"
        "lsl x20, %x[K], #0x2\n"
        ".inst 0x25b567f0  // whilelt p8.s, XZR, x21, VLx4\n"
        "cmp x11, #0x4\n"
        ".inst 0xf8b44958  // rprfm pldmany, x20, [x10]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xc0040c80  // mova za.d[x8, #0], { z4.d-z7.d }\n"
        "addvl x22, x22, #2\n"
        "addvl x24, x24, #2\n"
        ".inst 0xc0040c01  // mova za.d[x8, #1], { z0.d-z3.d }\n"
        "addvl x23, x23, #2\n"
        "ble 12f\n"
        "11:"  // Width 2: Multiply loop: Main loop head
        "whilelt p0.s, XZR, x11\n"
        ".inst 0xa0404605  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        "ld1rqw { z0.s }, p0/Z, [x10]\n"
        "sub x11, x11, #0x4\n"
        "add x10, x10, #0x10\n"
        ".inst 0xa04046c7  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        "cmp x11, #0x4\n"
        ".inst 0xa0404715  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa04046f7  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1508080  // fmla za.s[x8, 0], { z4.s-z7.s }, z0.s[0]\n"
        ".inst 0xa0404619  // ldnt1w { z24.s-z25.s }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046db  // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc1508281  // fmla za.s[x8, 1], { z20.s-z23.s }, z0.s[0]\n"
        ".inst 0xa0404709  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa04046eb  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1508700  // fmla za.s[x8, 0], { z24.s-z27.s }, z0.s[1]\n"
        ".inst 0xa040461d  // ldnt1w { z28.s-z29.s }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046df  // ldnt1w { z30.s-z31.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc1508501  // fmla za.s[x8, 1], { z8.s-z11.s }, z0.s[1]\n"
        ".inst 0xa0404709  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa04046eb  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1508b80  // fmla za.s[x8, 0], { z28.s-z31.s }, z0.s[2]\n"
        ".inst 0xa0404619  // ldnt1w { z24.s-z25.s }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046db  // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc1508901  // fmla za.s[x8, 1], { z8.s-z11.s }, z0.s[2]\n"
        ".inst 0xa040470d  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa04046ef  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1508f00  // fmla za.s[x8, 0], { z24.s-z27.s }, z0.s[3]\n"
        ".inst 0xc1508d81  // fmla za.s[x8, 1], { z12.s-z15.s }, z0.s[3]\n"
        "bgt 11b\n"
        "12:"  // Width 2: Multiply loop: Single iteration only
        "whilelt p0.s, XZR, x11\n"
        ".inst 0xa0404605  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x1\n"
        "ld1rqw { z8.s }, p0/Z, [x10]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046c7  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0404715  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa04046f7  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1588080  // fmla za.s[x8, 0], { z4.s-z7.s }, z8.s[0]\n"
        ".inst 0xc1588281  // fmla za.s[x8, 1], { z20.s-z23.s }, z8.s[0]\n"
        "ble 13f\n"
        ".inst 0xa040460d  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x1\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046cf  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa040471d  // ldnt1w { z28.s-z29.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa04046ff  // ldnt1w { z30.s-z31.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1588580  // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s[1]\n"
        ".inst 0xc1588781  // fmla za.s[x8, 1], { z28.s-z31.s }, z8.s[1]\n"
        "ble 13f\n"
        ".inst 0xa040461d  // ldnt1w { z28.s-z29.s }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x1\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046df  // ldnt1w { z30.s-z31.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0404701  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa04046e3  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1588b80  // fmla za.s[x8, 0], { z28.s-z31.s }, z8.s[2]\n"
        ".inst 0xc1588801  // fmla za.s[x8, 1], { z0.s-z3.s }, z8.s[2]\n"
        "ble 13f\n"
        ".inst 0xa0404615  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x16]\n"
        ".inst 0xa04046d7  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x22]\n"
        ".inst 0xa0404701  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x24]\n"
        ".inst 0xa04046e3  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x23]\n"
        ".inst 0xc1588e80  // fmla za.s[x8, 0], { z20.s-z23.s }, z8.s[3]\n"
        ".inst 0xc1588c01  // fmla za.s[x8, 1], { z0.s-z3.s }, z8.s[3]\n"
        "13:"  // Width 2: Multiply loop: multiply skip
        "tbz %x[flags], #1, 14f\n"
        "add x21, %x[args_ptr], %[offset_min]\n"
        "add x20, %x[args_ptr], %[offset_max]\n"
        ".inst 0xc0060c1c  // mova { z28.d-z31.d }, za.d[x8, #0]\n"
        ".inst 0xc0060c24  // mova { z4.d-z7.d }, za.d[x8, #1]\n"
        "ld1rw { z17.s }, p1/Z, [x21]\n"
        "ld1rw { z9.s }, p1/Z, [x20]\n"
        ".inst 0xc1a9ca3c  // fclamp { z28.s-z31.s }, z17.s, z9.s\n"
        ".inst 0xc1a9ca24  // fclamp { z4.s-z7.s }, z17.s, z9.s\n"
        ".inst 0xa060c5dc  // st1w { z28.s-z31.s }, pn9.b, [x14]\n"
        ".inst 0xa061c1c4  // st1w { z4.s-z7.s }, p8, [x14, #0x4, MUL VL]\n"
        "b 15f\n"
        "14:"  // Width 2: No activation
        ".inst 0xc0060c04  // mova { z4.d-z7.d }, za.d[x8, #0]\n"
        ".inst 0xc0060c3c  // mova { z28.d-z31.d }, za.d[x8, #1]\n"
        ".inst 0xa060c5c4  // st1w { z4.s-z7.s }, pn9.b, [x14]\n"
        ".inst 0xa061c1dc  // st1w { z28.s-z31.s }, p8, [x14, #0x4, MUL VL]\n"
        "15:"  // Width 2: Output done
        "b 28f\n"
        "16:"  // Width 3
        "add x26, x16, x12, LSL #2\n"
        "cntw x20, ALL, MUL #10\n"
        ".inst 0xa0404614  // ld1w { z20.s-z21.s }, pn9.b/Z, [x16]\n"
        "add x25, x16, x12, LSL #1\n"
        "add x24, x26, x12\n"
        ".inst 0xa0404740  // ld1w { z0.s-z1.s }, pn9.b/Z, [x26]\n"
        "cmp %x[N], x20\n"
        "add x23, x16, x12\n"
        ".inst 0xa0404730  // ld1w { z16.s-z17.s }, pn9.b/Z, [x25]\n"
        "add x22, x25, x12\n"
        "csel x24, x24, x16, GT\n"
        ".inst 0xa04046f6  // ld1w { z22.s-z23.s }, pn9.b/Z, [x23]\n"
        "mov x20, #0x2\n"
        ".inst 0xa04046d2  // ld1w { z18.s-z19.s }, pn9.b/Z, [x22]\n"
        "mov x11, %x[K]\n"
        ".inst 0xa0404702  // ld1w { z2.s-z3.s }, pn9.b/Z, [x24]\n"
        "msub x21, x15, x20, %x[N]\n"
        "mov x10, %x[A_ptr]\n"
        "lsl x20, %x[K], #0x2\n"
        ".inst 0x25b567f0  // whilelt p8.s, XZR, x21, VLx4\n"
        ".inst 0xc0040e80  // mova za.d[x8, #0], { z20.d-z23.d }\n"
        "cmp x11, #0x4\n"
        ".inst 0xf8b44958  // rprfm pldmany, x20, [x10]\n"
        ".inst 0xc0040e01  // mova za.d[x8, #1], { z16.d-z19.d }\n"
        "addvl x16, x16, #2\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc0040c02  // mova za.d[x8, #2], { z0.d-z3.d }\n"
        "addvl x25, x25, #2\n"
        "addvl x22, x22, #2\n"
        "addvl x26, x26, #2\n"
        "addvl x24, x24, #2\n"
        "ble 18f\n"
        "17:"  // Width 3: Multiply loop: Main loop head
        "whilelt p0.s, XZR, x11\n"
        ".inst 0xa040460d  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        "ld1rqw { z3.s }, p0/Z, [x10]\n"
        "sub x11, x11, #0x4\n"
        "add x10, x10, #0x10\n"
        ".inst 0xa04046ef  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        "cmp x11, #0x4\n"
        ".inst 0xa0404729  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa04046cb  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0404751  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x26]\n"
        ".inst 0xc1538180  // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s[0]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xa0404713  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xc1538101  // fmla za.s[x8, 1], { z8.s-z11.s }, z3.s[0]\n"
        ".inst 0xa0404609  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046eb  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1538202  // fmla za.s[x8, 2], { z16.s-z19.s }, z3.s[0]\n"
        ".inst 0xa0404731  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa04046d3  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0404745  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x26]\n"
        ".inst 0xc1538500  // fmla za.s[x8, 0], { z8.s-z11.s }, z3.s[1]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xa0404707  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xc1538601  // fmla za.s[x8, 1], { z16.s-z19.s }, z3.s[1]\n"
        ".inst 0xa0404609  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046eb  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1538482  // fmla za.s[x8, 2], { z4.s-z7.s }, z3.s[1]\n"
        ".inst 0xa0404731  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa04046d3  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0404745  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x26]\n"
        ".inst 0xc1538900  // fmla za.s[x8, 0], { z8.s-z11.s }, z3.s[2]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xa0404707  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xc1538a01  // fmla za.s[x8, 1], { z16.s-z19.s }, z3.s[2]\n"
        ".inst 0xa0404615  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046f7  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1538882  // fmla za.s[x8, 2], { z4.s-z7.s }, z3.s[2]\n"
        ".inst 0xa0404739  // ldnt1w { z24.s-z25.s }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa04046db  // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0404751  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x26]\n"
        ".inst 0xc1538e80  // fmla za.s[x8, 0], { z20.s-z23.s }, z3.s[3]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xa0404713  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xc1538f01  // fmla za.s[x8, 1], { z24.s-z27.s }, z3.s[3]\n"
        ".inst 0xc1538e02  // fmla za.s[x8, 2], { z16.s-z19.s }, z3.s[3]\n"
        "bgt 17b\n"
        "18:"  // Width 3: Multiply loop: Single iteration only
        "whilelt p0.s, XZR, x11\n"
        ".inst 0xa0404605  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x1\n"
        "ld1rqw { z8.s }, p0/Z, [x10]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046e7  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa040473d  // ldnt1w { z28.s-z29.s }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa04046df  // ldnt1w { z30.s-z31.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0404755  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x26]\n"
        ".inst 0xc1588080  // fmla za.s[x8, 0], { z4.s-z7.s }, z8.s[0]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xa0404717  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xc1588381  // fmla za.s[x8, 1], { z28.s-z31.s }, z8.s[0]\n"
        ".inst 0xc1588282  // fmla za.s[x8, 2], { z20.s-z23.s }, z8.s[0]\n"
        "ble 19f\n"
        ".inst 0xa040460d  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x1\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046ef  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa0404725  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa04046c7  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0404751  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x26]\n"
        ".inst 0xc1588580  // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s[1]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xa0404713  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xc1588481  // fmla za.s[x8, 1], { z4.s-z7.s }, z8.s[1]\n"
        ".inst 0xc1588602  // fmla za.s[x8, 2], { z16.s-z19.s }, z8.s[1]\n"
        "ble 19f\n"
        ".inst 0xa0404601  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x1\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04046e3  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa040472d  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa04046cf  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0404751  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x26]\n"
        ".inst 0xc1588800  // fmla za.s[x8, 0], { z0.s-z3.s }, z8.s[2]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xa0404713  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xc1588981  // fmla za.s[x8, 1], { z12.s-z15.s }, z8.s[2]\n"
        ".inst 0xc1588a02  // fmla za.s[x8, 2], { z16.s-z19.s }, z8.s[2]\n"
        "ble 19f\n"
        ".inst 0xa0404605  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x16]\n"
        ".inst 0xa04046e7  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x23]\n"
        ".inst 0xa040472d  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x25]\n"
        ".inst 0xa04046cf  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x22]\n"
        ".inst 0xa0404755  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x26]\n"
        ".inst 0xc1588c80  // fmla za.s[x8, 0], { z4.s-z7.s }, z8.s[3]\n"
        ".inst 0xa0404717  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x24]\n"
        ".inst 0xc1588d81  // fmla za.s[x8, 1], { z12.s-z15.s }, z8.s[3]\n"
        ".inst 0xc1588e82  // fmla za.s[x8, 2], { z20.s-z23.s }, z8.s[3]\n"
        "19:"  // Width 3: Multiply loop: multiply skip
        "tbz %x[flags], #1, 20f\n"
        "add x21, %x[args_ptr], %[offset_min]\n"
        "add x20, %x[args_ptr], %[offset_max]\n"
        ".inst 0xc0060c08  // mova { z8.d-z11.d }, za.d[x8, #0]\n"
        ".inst 0xc0060c2c  // mova { z12.d-z15.d }, za.d[x8, #1]\n"
        "ld1rw { z21.s }, p1/Z, [x21]\n"
        ".inst 0xc0060c50  // mova { z16.d-z19.d }, za.d[x8, #2]\n"
        "ld1rw { z20.s }, p1/Z, [x20]\n"
        ".inst 0xc1b4caa8  // fclamp { z8.s-z11.s }, z21.s, z20.s\n"
        ".inst 0xc1b4caac  // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
        ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
        ".inst 0xa060c5c8  // st1w { z8.s-z11.s }, pn9.b, [x14]\n"
        ".inst 0xa061c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14, #0x4, MUL VL]\n"
        ".inst 0xa062c1d0  // st1w { z16.s-z19.s }, p8, [x14, #0x8, MUL VL]\n"
        "b 21f\n"
        "20:"  // Width 3: No activation
        ".inst 0xc0060c04  // mova { z4.d-z7.d }, za.d[x8, #0]\n"
        ".inst 0xc0060c20  // mova { z0.d-z3.d }, za.d[x8, #1]\n"
        ".inst 0xc0060c50  // mova { z16.d-z19.d }, za.d[x8, #2]\n"
        ".inst 0xa060c5c4  // st1w { z4.s-z7.s }, pn9.b, [x14]\n"
        ".inst 0xa061c5c0  // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
        ".inst 0xa062c1d0  // st1w { z16.s-z19.s }, p8, [x14, #0x8, MUL VL]\n"
        "21:"  // Width 3: Output done
        "b 28f\n"
        "22:"  // Width 4
        "add x9, x16, x12, LSL #2\n"
        "cntw x20, ALL, MUL #14\n"
        ".inst 0xa040460c  // ld1w { z12.s-z13.s }, pn9.b/Z, [x16]\n"
        "add x28, x9, x12, LSL #1\n"
        "add x27, x16, x12, LSL #1\n"
        ".inst 0xa0404528  // ld1w { z8.s-z9.s }, pn9.b/Z, [x9]\n"
        "add x26, x28, x12\n"
        "cmp %x[N], x20\n"
        ".inst 0xa0404760  // ld1w { z0.s-z1.s }, pn9.b/Z, [x27]\n"
        "add x25, x16, x12\n"
        "add x24, x27, x12\n"
        ".inst 0xa0404790  // ld1w { z16.s-z17.s }, pn9.b/Z, [x28]\n"
        "add x23, x9, x12\n"
        "csel x26, x26, x16, GT\n"
        ".inst 0xa040472e  // ld1w { z14.s-z15.s }, pn9.b/Z, [x25]\n"
        "mov x20, #0x3\n"
        ".inst 0xa0404702  // ld1w { z2.s-z3.s }, pn9.b/Z, [x24]\n"
        "mov x11, %x[K]\n"
        ".inst 0xa04046ea  // ld1w { z10.s-z11.s }, pn9.b/Z, [x23]\n"
        "msub x21, x15, x20, %x[N]\n"
        "mov x10, %x[A_ptr]\n"
        ".inst 0xa0404752  // ld1w { z18.s-z19.s }, pn9.b/Z, [x26]\n"
        "lsl x20, %x[K], #0x2\n"
        ".inst 0x25b567f0  // whilelt p8.s, XZR, x21, VLx4\n"
        ".inst 0xc0040d80  // mova za.d[x8, #0], { z12.d-z15.d }\n"
        "cmp x11, #0x4\n"
        ".inst 0xf8b44958  // rprfm pldmany, x20, [x10]\n"
        ".inst 0xc0040c01  // mova za.d[x8, #1], { z0.d-z3.d }\n"
        "add x22, x16, x12, LSL #3\n"
        "addvl x16, x16, #2\n"
        ".inst 0xc0040d02  // mova za.d[x8, #2], { z8.d-z11.d }\n"
        "addvl x25, x25, #2\n"
        "addvl x27, x27, #2\n"
        ".inst 0xc0040e03  // mova za.d[x8, #3], { z16.d-z19.d }\n"
        "addvl x24, x24, #2\n"
        "addvl x9, x9, #2\n"
        "addvl x23, x23, #2\n"
        "addvl x28, x28, #2\n"
        "addvl x26, x26, #2\n"
        "ble 24f\n"
        "23:"  // Width 4: Multiply loop: Main loop head
        "whilelt p0.s, XZR, x11\n"
        ".inst 0xa0404609  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        "ld1rqw { z13.s }, p0/Z, [x10]\n"
        "sub x11, x11, #0x4\n"
        "add x10, x10, #0x10\n"
        ".inst 0xa040472b  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        "cmp x11, #0x4\n"
        ".inst 0xa0404765  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x27]\n"
        "addvl x27, x27, #2\n"
        ".inst 0xa0404707  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa0404531  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x9]\n"
        ".inst 0xc15d8100  // fmla za.s[x8, 0], { z8.s-z11.s }, z13.s[0]\n"
        "addvl x9, x9, #2\n"
        ".inst 0xa04046f3  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa0404781  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x28]\n"
        ".inst 0xc15d8081  // fmla za.s[x8, 1], { z4.s-z7.s }, z13.s[0]\n"
        "addvl x28, x28, #2\n"
        ".inst 0xa0404743  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x26]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xc15d8202  // fmla za.s[x8, 2], { z16.s-z19.s }, z13.s[0]\n"
        ".inst 0xa040461d  // ldnt1w { z28.s-z29.s }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa040473f  // ldnt1w { z30.s-z31.s }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xc15d8003  // fmla za.s[x8, 3], { z0.s-z3.s }, z13.s[0]\n"
        ".inst 0xa0404761  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x27]\n"
        "addvl x27, x27, #2\n"
        ".inst 0xa0404703  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa0404529  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x9]\n"
        ".inst 0xc15d8780  // fmla za.s[x8, 0], { z28.s-z31.s }, z13.s[1]\n"
        "addvl x9, x9, #2\n"
        ".inst 0xa04046eb  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa0404791  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x28]\n"
        ".inst 0xc15d8401  // fmla za.s[x8, 1], { z0.s-z3.s }, z13.s[1]\n"
        "addvl x28, x28, #2\n"
        ".inst 0xa0404753  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x26]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xc15d8502  // fmla za.s[x8, 2], { z8.s-z11.s }, z13.s[1]\n"
        ".inst 0xa0404605  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa0404727  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xc15d8603  // fmla za.s[x8, 3], { z16.s-z19.s }, z13.s[1]\n"
        ".inst 0xa0404761  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x27]\n"
        "addvl x27, x27, #2\n"
        ".inst 0xa0404703  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa0404529  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x9]\n"
        ".inst 0xc15d8880  // fmla za.s[x8, 0], { z4.s-z7.s }, z13.s[2]\n"
        "addvl x9, x9, #2\n"
        ".inst 0xa04046eb  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa0404791  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x28]\n"
        ".inst 0xc15d8801  // fmla za.s[x8, 1], { z0.s-z3.s }, z13.s[2]\n"
        "addvl x28, x28, #2\n"
        ".inst 0xa0404753  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x26]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xc15d8902  // fmla za.s[x8, 2], { z8.s-z11.s }, z13.s[2]\n"
        ".inst 0xa0404615  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa0404737  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xc15d8a03  // fmla za.s[x8, 3], { z16.s-z19.s }, z13.s[2]\n"
        ".inst 0xa0404779  // ldnt1w { z24.s-z25.s }, pn9.b/Z, [x27]\n"
        "addvl x27, x27, #2\n"
        ".inst 0xa040471b  // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa0404529  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x9]\n"
        ".inst 0xc15d8e80  // fmla za.s[x8, 0], { z20.s-z23.s }, z13.s[3]\n"
        "addvl x9, x9, #2\n"
        ".inst 0xa04046eb  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa0404795  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x28]\n"
        ".inst 0xc15d8f01  // fmla za.s[x8, 1], { z24.s-z27.s }, z13.s[3]\n"
        "addvl x28, x28, #2\n"
        ".inst 0xa0404757  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x26]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xc15d8d02  // fmla za.s[x8, 2], { z8.s-z11.s }, z13.s[3]\n"
        ".inst 0xc15d8e83  // fmla za.s[x8, 3], { z20.s-z23.s }, z13.s[3]\n"
        "bgt 23b\n"
        "24:"  // Width 4: Multiply loop: Single iteration only
        "whilelt p0.s, XZR, x11\n"
        ".inst 0xa0404605  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x1\n"
        "ld1rqw { z8.s }, p0/Z, [x10]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa0404727  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa0404761  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x27]\n"
        "addvl x27, x27, #2\n"
        ".inst 0xa0404703  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa040452d  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x9]\n"
        ".inst 0xc1588080  // fmla za.s[x8, 0], { z4.s-z7.s }, z8.s[0]\n"
        "addvl x9, x9, #2\n"
        ".inst 0xa04046ef  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa0404791  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x28]\n"
        ".inst 0xc1588001  // fmla za.s[x8, 1], { z0.s-z3.s }, z8.s[0]\n"
        "addvl x28, x28, #2\n"
        ".inst 0xa0404753  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x26]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xc1588182  // fmla za.s[x8, 2], { z12.s-z15.s }, z8.s[0]\n"
        ".inst 0xc1588203  // fmla za.s[x8, 3], { z16.s-z19.s }, z8.s[0]\n"
        "ble 25f\n"
        ".inst 0xa040461d  // ldnt1w { z28.s-z29.s }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x1\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa040473f  // ldnt1w { z30.s-z31.s }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa0404761  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x27]\n"
        "addvl x27, x27, #2\n"
        ".inst 0xa0404703  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa0404525  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x9]\n"
        ".inst 0xc1588780  // fmla za.s[x8, 0], { z28.s-z31.s }, z8.s[1]\n"
        "addvl x9, x9, #2\n"
        ".inst 0xa04046e7  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa0404791  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x28]\n"
        ".inst 0xc1588401  // fmla za.s[x8, 1], { z0.s-z3.s }, z8.s[1]\n"
        "addvl x28, x28, #2\n"
        ".inst 0xa0404753  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x26]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xc1588482  // fmla za.s[x8, 2], { z4.s-z7.s }, z8.s[1]\n"
        ".inst 0xc1588603  // fmla za.s[x8, 3], { z16.s-z19.s }, z8.s[1]\n"
        "ble 25f\n"
        ".inst 0xa040461d  // ldnt1w { z28.s-z29.s }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x1\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa040473f  // ldnt1w { z30.s-z31.s }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa040476d  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x27]\n"
        "addvl x27, x27, #2\n"
        ".inst 0xa040470f  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa0404521  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x9]\n"
        ".inst 0xc1588b80  // fmla za.s[x8, 0], { z28.s-z31.s }, z8.s[2]\n"
        "addvl x9, x9, #2\n"
        ".inst 0xa04046e3  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa0404791  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x28]\n"
        ".inst 0xc1588981  // fmla za.s[x8, 1], { z12.s-z15.s }, z8.s[2]\n"
        "addvl x28, x28, #2\n"
        ".inst 0xa0404753  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x26]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xc1588802  // fmla za.s[x8, 2], { z0.s-z3.s }, z8.s[2]\n"
        ".inst 0xc1588a03  // fmla za.s[x8, 3], { z16.s-z19.s }, z8.s[2]\n"
        "ble 25f\n"
        ".inst 0xa0404605  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x16]\n"
        ".inst 0xa0404727  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x25]\n"
        ".inst 0xa040476d  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x27]\n"
        ".inst 0xa040470f  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x24]\n"
        ".inst 0xa0404535  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x9]\n"
        ".inst 0xc1588c80  // fmla za.s[x8, 0], { z4.s-z7.s }, z8.s[3]\n"
        ".inst 0xa04046f7  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x23]\n"
        ".inst 0xa0404791  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x28]\n"
        ".inst 0xc1588d81  // fmla za.s[x8, 1], { z12.s-z15.s }, z8.s[3]\n"
        ".inst 0xa0404753  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x26]\n"
        ".inst 0xc1588e82  // fmla za.s[x8, 2], { z20.s-z23.s }, z8.s[3]\n"
        ".inst 0xc1588e03  // fmla za.s[x8, 3], { z16.s-z19.s }, z8.s[3]\n"
        "25:"  // Width 4: Multiply loop: multiply skip
        "tbz %x[flags], #1, 26f\n"
        "add x21, %x[args_ptr], %[offset_min]\n"
        "add x20, %x[args_ptr], %[offset_max]\n"
        ".inst 0xc0060c04  // mova { z4.d-z7.d }, za.d[x8, #0]\n"
        ".inst 0xc0060c20  // mova { z0.d-z3.d }, za.d[x8, #1]\n"
        "ld1rw { z21.s }, p1/Z, [x21]\n"
        ".inst 0xc0060c4c  // mova { z12.d-z15.d }, za.d[x8, #2]\n"
        "ld1rw { z20.s }, p1/Z, [x20]\n"
        ".inst 0xc0060c70  // mova { z16.d-z19.d }, za.d[x8, #3]\n"
        ".inst 0xc1b4caa4  // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
        ".inst 0xc1b4caa0  // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
        ".inst 0xc1b4caac  // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
        ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
        ".inst 0xa060c5c4  // st1w { z4.s-z7.s }, pn9.b, [x14]\n"
        ".inst 0xa061c5c0  // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
        ".inst 0xa062c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14, #0x8, MUL VL]\n"
        ".inst 0xa063c1d0  // st1w { z16.s-z19.s }, p8, [x14, #0xc, MUL VL]\n"
        "addvl x14, x14, #16\n"
        "b 27f\n"
        "26:"  // Width 4: No activation
        ".inst 0xc0060c0c  // mova { z12.d-z15.d }, za.d[x8, #0]\n"
        ".inst 0xc0060c20  // mova { z0.d-z3.d }, za.d[x8, #1]\n"
        ".inst 0xc0060c50  // mova { z16.d-z19.d }, za.d[x8, #2]\n"
        ".inst 0xc0060c64  // mova { z4.d-z7.d }, za.d[x8, #3]\n"
        ".inst 0xa060c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
        ".inst 0xa061c5c0  // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
        ".inst 0xa062c5d0  // st1w { z16.s-z19.s }, pn9.b, [x14, #0x8, MUL VL]\n"
        ".inst 0xa063c1c4  // st1w { z4.s-z7.s }, p8, [x14, #0xc, MUL VL]\n"
        "addvl x14, x14, #16\n"
        "27:"  // Width 4: Output done
        "subs x13, x13, #0x4\n"
        "mov x16, x22\n"
        "sub %x[N], %x[N], x15, LSL #2\n"
        "bgt 4b\n"
        "28:"  // Exit
        ".inst 0xd503467f  // SMSTOP\n"
        : [N] "+&r"(N)
        : [A_ptr] "r"(A_ptr), [B_ptr] "r"(B_ptr), [K] "r"(K), [args_ptr] "r"(&ka), [flags] "r"(flags),
          [offset_max] "I"(offsetof(KernelArgs, maxval)), [offset_min] "I"(offsetof(KernelArgs, minval)),
          [output_ptr] "r"(output_ptr)
        : "cc", "memory", "p0", "p1", "p10", "p11", "p12", "p13", "p14", "p15", "p2", "p3", "p4", "p5", "p6", "p7",
          "p8", "p9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26",
          "x27", "x28", "x8", "x9", "z0", "z1", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19",
          "z2", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z3", "z30", "z31", "z4", "z5",
          "z6", "z7", "z8", "z9");
}

#endif  // Architectural features check.
